Process each sorted sample
pathway <- "/Users/rhalenathomas/Documents/Data/scRNAseq/PhenoID/scRNAseqSorted/objs/"
Neurons1 <- readRDS(paste(pathway,"seu1.rds",sep = ""))
Neurons2 <- readRDS(paste(pathway,"seu2.rds",sep = ""))
Glia1 <- readRDS(paste(pathway,"seu3.rds",sep = ""))
Glia2 <- readRDS(paste(pathway,"seu4.rds",sep = ""))
Neurons1
An object of class Seurat
33541 features across 3952 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Neurons2
An object of class Seurat
33541 features across 34830 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Glia1 #-Astrocytes
An object of class Seurat
33541 features across 54723 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Glia2 #-Radial Glia
An object of class Seurat
33541 features across 10338 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Check QC
library(Seurat)
VlnPlot(Neurons1, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(Neurons1, pt.size = 0.10, features = c("nFeature_RNA"), y.max = 1000)
Warning: Removed 555 rows containing non-finite values (stat_ydensity).
Warning: Removed 555 rows containing missing values (geom_point).
VlnPlot(Neurons2, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(Glia1, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
VlnPlot(Glia2, pt.size = 0.10, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
NA
NA
NA
Filter
Neuron1.ft <- subset(Neurons1, subset = nFeature_RNA > 250 & nCount_RNA < 10000)
Neuron2.ft <- subset(Neurons2, subset = nFeature_RNA > 350 & nCount_RNA > 250 & nCount_RNA < 10000)
Glia1.ft <- subset(Glia1, subset = nFeature_RNA > 500 & nCount_RNA > 500 & nCount_RNA < 10000)
Glia2.ft <- subset(Glia2, subset = nFeature_RNA > 300 & nCount_RNA > 500 & nCount_RNA < 10000)
Neuron1.ft
An object of class Seurat
33541 features across 1833 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Neuron2.ft
An object of class Seurat
33541 features across 10162 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Glia1.ft
An object of class Seurat
33541 features across 37813 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Glia2.ft
An object of class Seurat
33541 features across 5000 samples within 2 assays
Active assay: RNA (33538 features, 0 variable features)
1 other assay present: HTO
Remove doublets from each Sorted population
suppressMessages(require(DoubletFinder))
# Neurons1
seu.d <- Neuron1.ft
seu.d <- NormalizeData(seu.d)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 20)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
# starts with 1833 cells
nExp <- round(ncol(seu.d) * 0.013) # expect close to 1.3doublets
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
[1] "Creating 611 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|=========================================== | 50%
|
|======================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
[1] 33538 1809
Neurons1 <- seu.d
# Neurons2
seu.d <- Neuron2.ft # starting 10162 cells
seu.d <- NormalizeData(seu.d)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 20)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.076) # expect 7.6% doublets for
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
[1] "Creating 3387 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|=========================================== | 50%
|
|======================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
[1] 33538 9390
Neurons2 <- seu.d
Astrocytes has too many cells Demultiplex with hashtags
DefaultAssay(Glia1.ft) <- 'HTO'
ht <- HTODemux(Glia1.ft, positive.quantile = 0.98)
Warning in PseudobulkExpression(object = object, pb.method = "average", :
Exponentiation yielded infinite values. `data` may not be log-normed.
Cutoff for B0251-TotalSeqB : 78 reads
Cutoff for B0252-TotalSeqB : 92 reads
Cutoff for B0256-TotalSeqB : 1 reads
table(ht$HTO_classification.global)
Doublet Negative Singlet
3812 25323 8678
table(ht$HTO_classification)
B0251-TotalSeqB B0251-TotalSeqB_B0251-TotalSeqB B0251-TotalSeqB_B0252-TotalSeqB
4654 46 3766
B0252-TotalSeqB Negative
4024 25323
table(ht$hash.ID)
Doublet B0252-TotalSeqB Negative B0251-TotalSeqB
3812 4024 25323 4654
RidgePlot(ht, assay = "HTO", group.by = "HTO_maxID", features = rownames(ht[["HTO"]]), log = TRUE)
Scale for 'x' is already present. Adding another scale for 'x', which will replace the
existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which will replace the
existing scale.
Scale for 'x' is already present. Adding another scale for 'x', which will replace the
existing scale.
Picking joint bandwidth of 0.028
Picking joint bandwidth of 0.0257
Picking joint bandwidth of 0.000306
# perhaps I put the wrong hashtags in the compute canada pipeline
VlnPlot(ht, assay = "RNA", group.by = 'hash.ID', features = "nFeature_RNA")
# one of the replicates isn't represented labelled or not recognize well
# I'll select the two reps with clear hashtags
Idents(ht) <- 'hash.ID'
glia1.temp <- subset(ht, idents = c("B0252-TotalSeqB","B0251-TotalSeqB"))
dim(glia1.temp)
[1] 3 8678
In this case some doublets will already be removed
# Glia 1 astrocytes 8678 cells
DefaultAssay(glia1.temp) <- "RNA"
seu.d <- glia1.temp
seu.d <- NormalizeData(seu.d)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 20)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.064) # expect 6.4% doublets input of 13000 cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
[1] "Creating 2893 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|=========================================== | 50%
|
|======================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
[1] 33538 8123
Astrocytes <- seu.d
Glia2 - Radial Glia, low CD44 in the FACS sort.
seu.d <- Glia2.ft # 5000 cells
seu.d <- NormalizeData(seu.d)
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
seu.d = FindVariableFeatures(seu.d, verbose = F)
seu.d = ScaleData(seu.d, vars.to.regress = c("nFeature_RNA", "percent.mt"),
verbose = F)
seu.d = RunPCA(seu.d, verbose = F, npcs = 20)
seu.d = RunUMAP(seu.d, dims = 1:10, verbose = F)
nExp <- round(ncol(seu.d) * 0.039) # expect 6% doublets 5000 cells
seu.d <- doubletFinder_v3(seu.d, pN = 0.25, pK = 0.09, nExp = nExp, PCs = 1:10)
[1] "Creating 1667 artificial doublets..."
[1] "Creating Seurat object..."
[1] "Normalizing Seurat object..."
Performing log-normalization
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Finding variable genes..."
Calculating gene variances
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
Calculating feature variances of standardized and clipped values
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
[1] "Scaling data..."
Centering and scaling data matrix
|
| | 0%
|
|=========================================== | 50%
|
|======================================================================================| 100%
[1] "Running PCA..."
[1] "Calculating PC distance matrix..."
[1] "Computing pANN..."
[1] "Classifying doublets.."
# name of the DF prediction can change, so extract the correct column name.
DF.name = colnames(seu.d@meta.data)[grepl("DF.classification", colnames(seu.d@meta.data))]
seu.d <- seu.d[, seu.d@meta.data[, DF.name]== "Singlet"]
dim(seu.d)
[1] 33538 4805
RadialGlia <- seu.d
Annotate the cell types in each Flow population
Rename Idents to prepare to merge
unique(sorted.merge$orig.ident)
[1] "Neurons1" "Neurons2" "Astroctyes" "RadialGlia"
Cluster merge object
seu <- RunUMAP(seu, dim = 1:18, n.neighbors = 43)
21:59:45 UMAP embedding parameters a = 0.9922 b = 1.112
21:59:45 Read 7809 rows and found 18 numeric columns
21:59:45 Using Annoy for neighbor search, n_neighbors = 43
21:59:45 Building Annoy index with metric = cosine, n_trees = 50
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
21:59:45 Writing NN index file to temp file /var/folders/k4/khtkczkd5tn732ftjpwgtr240000gn/T//Rtmp0GH9mR/file1304393f0b75
21:59:46 Searching Annoy index using 1 thread, search_k = 4300
21:59:48 Annoy recall = 100%
21:59:48 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 43
21:59:49 Initializing from normalized Laplacian + noise (using irlba)
21:59:49 Commencing optimization for 500 epochs, with 480674 positive edges
Using method 'umap'
0% 10 20 30 40 50 60 70 80 90 100%
[----|----|----|----|----|----|----|----|----|----|
**************************************************|
22:00:03 Optimization finished
DimPlot(seu, reduction = "umap")
Quick annotation from previously annotated
NOTE TO ME *******
I will need to look through the pre - filtering cells - neurons 1 is the same
Possible just down sample astro so the numbers are the same. The other samples were randomly downsampled. Maybe if I repeat the down sampling with different seeds I’ll get results more similar. Also if I cluster before down sampling like I did the first time.
Calculate proprotion in object in the figures:
table(seu.L$orig.ident)
Neurons1 Neurons2 Astrocytes RadialGlia
1723 2000 3000 2000
table(seu.L$orig.ident, seu.L$Cell_Subtype_Markers)
Astrocytes-HPD Neurons-ASCL1 RadialGlia-RPL41 Neurons-SPARCL1 Astrocytes-COL3A1
Neurons1 8 2 10 564 0
Neurons2 72 784 172 0 5
Astrocytes 1535 11 48 0 747
RadialGlia 175 57 587 0 207
Neurons-TFPI1 RadialGlia-PTN Neurons-CP Astrocytes-FABP5 Neurons-MGP
Neurons1 5 112 333 5 0
Neurons2 302 53 0 3 237
Astrocytes 55 45 0 324 72
RadialGlia 53 441 0 1 17
RadialGlia-NEAT1 DANeurons-RAB3B Neurons-GRIA2 Mix NPC Neurons-TPH1
Neurons1 4 131 245 15 140 4
Neurons2 24 76 0 84 6 105
Astrocytes 35 0 0 81 3 4
RadialGlia 241 52 0 48 5 20
DANeurons-TPBG DANeurons-TTR RadialGlia-CY1B1 RadialGlia-TOP2A RadialGlia-VCAN
Neurons1 10 65 57 12 1
Neurons2 50 20 0 7 0
Astrocytes 7 0 0 18 15
RadialGlia 46 19 0 8 23
Figure 6 F
pdf(paste(output_path,"BarChartProportionCellTypesin4popsNov2.pdf"), height = 5, width = 8)
ggplot(df, aes(x = FACpop, y = Freq, fill = Cell_subtypes)) +
geom_col(position = "fill") + theme_classic() +
scale_fill_manual(values = clust.colours) +
#scale_x_discrete(expand = c(0, 0)) +
scale_y_continuous(expand = c(0,0)) +
theme(axis.text.x = element_text(angle = 90, size = 16), axis.text.y = element_text(size = 16)) +
labs(y = "Proportion of Cells", x = "") +
theme(axis.title.y = element_text(size = 16), legend.text = element_text(size = 16), legend.title = element_text(size = 16))
dev.off()
null device
1